#Amy Finnegan
#R file for health study replication
#ordered by variables you need to create, descriptive statistics by model, bivariate relationships
#models in order: Spending as percent GDP, Total Spending, HALE with their corresponding cross-validation
#quality of government variable created at the end

library(memisc)
library(tonymisc)
library(AER)
library(xtable)
library(nlme)
library(ggplot2)
library(arm)
library(MASS)
library(pastecs)
library(gplots)
library(apsrtable)
library(calibrate)
library(plyr)


set.seed(1234)

#set working directory
setwd("C:\\Users\\Owner\\Desktop\\Spring 2012\\Duke Spring 2012\\PoliSci 303\\Project\\Health Political Economy")

health<- load("Huthrussettghobarah_data_article5.RData")

health<-x
summary(health)
ls(health)

#Create a few things you will need first


#creating the total spending variable
health$total98log<- as.numeric(X._K_total98_log)



#creating the WHO regions 1 to 5 Africa, Americas, EastMediteranean, Europe, Asia Pacific
attach(health)
WHOregion<- as.numeric(Region_ID)
WHOregion[Region_ID <= 2]<- 1
WHOregion[Region_ID >= 3 & Region_ID <= 5]<- 2
WHOregion[Region_ID >= 6 & Region_ID <=7]<- 3
WHOregion[Region_ID >= 8 & Region_ID <=10]<- 4
WHOregion[Region_ID >= 11]<- 5
WHOregion

##descriptive statistics by model

#Model 1
#making a table of descriptive statistics using pastecs package
attach(health)
vars<- cbind(pub_exp_prcnt_gdp97who, cia_Log_GDP_PerCapita, WHOGini1997, log_Educational_Attainment, log_vanhanen, any_89_Endrng_rivalry1, POLITY96_97)  
options(scipen=100)
options(digits=2)
descriptives<- stat.desc(vars)
xtable(descriptives)

#Model 2
#making a table of descriptive statistics using pastecs package
attach(health)
vars<- cbind(total98log, cia_Log_GDP_PerCapita, log_Educational_Attainment,  prvt_exp_rcnt_gdp, pub_98_gdp_prcnt)
options(scipen=100)
options(digits=2)
descriptives<- stat.desc(vars)
xtable(descriptives)

# Model 3
#making a table of descriptive statistics using pastecs package 
attach(health)
vars<- cbind(HALE_2000, total98log, GROWTH_URBAN9095_POP_UN, WHOGini1997, log_vanhanen, log_Educational_Attainment,  deathn91_97, contig_civil_war)
options(scipen=100)
options(digits=2)
descriptives<- stat.desc(vars)
xtable(descriptives)



##Bivariate relationship plots

attach(health)
plot(POLITY96_97, HALE_2000, pch=19, main="Relationship between Democracy and HALE", xlab="Polity Score", ylab="HALE 2000", col=WHOregion)
abline(lm(HALE_2000~POLITY96_97))
textxy(POLITY96_97, HALE_2000, ID)



#bivariate relationship
plot(cia_Log_GDP_PerCapita, pub_exp_prcnt_gdp97who)
abline(lm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita))

#demand curve, upward sloping so that more HALE when it is costlier
#colored by region ID
plot(HALE_2000, pub_exp_prcnt_gdp97who, col=Region_ID)
abline(lm(pub_exp_prcnt_gdp97who~ HALE_2000))

#colored by Africa
plot(HALE_2000, pub_exp_prcnt_gdp97who, col=Africa)
abline(lm(pub_exp_prcnt_gdp97who~ HALE_2000))

#coloared by Americas
plot(HALE_2000, pub_exp_prcnt_gdp97who, col=Americas)
abline(lm(pub_exp_prcnt_gdp97who~ HALE_2000, subset=health$Americas))


#coloared by Europe
plot(HALE_2000, pub_exp_prcnt_gdp97who, col=Europe)
abline(lm(pub_exp_prcnt_gdp97who~ HALE_2000))



###First Model: Health as % of GDP###  WORKS!!#

#expenditures by region
boxplot(pub_exp_prcnt_gdp97who, total98log, log_Educational_Attainment)
boxplot(pub_exp_prcnt_gdp97who~WHOregion, xlab="Regions", ylab="Public Expenditures on Health as a % of GDP per capita", main="Public Expenditures on Health as a % of GDP per Capita by Region")
#pdf("By_Region.pdf", h= 6.5, w = 6.5)

#HALE by Region
boxplot(HALE_2000~WHOregion, xlab="Regions",ylab="Health Adjusted Life Expectancy, HALE", main="HALE by Region")
#pdf("HALE_by_Region.pdf", h = 6.5, w = 6.5)

#polity by region
boxplot(POLITY96_97~WHOregion, xlab="Regions", ylab="Polity IV", main="Polity Score by Region")


attach(health)

health$total_97_log<- X._K_public97_log
health$total_97<- as.numeric(X._K_public97)

summary(pub_exp_prcnt_gdp97who)
pub_exp_prcnt_gdp97who


##Model 1: WORKS!##
lm.out<- lm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97)
summary(lm.out)
xtable(lm.out,digits=3)
#apsrtable(lm.out, lm.out2, lm.out3)
lm.out$fitted.values
plot(lm.out)

#coplot(HALE_2000~pub_exp_prcnt_gdp97who|WHOregion)


##plotted fitted vs. observed

#yhat<- lm.out$fitted.values
#y<- pub_exp_prcnt_gdp97who[pub_exp_prcnt_gdp97who[3:182,]]
#yhaty<- cbind(yhat, y)
#plot(y, yhat)

#barchart by region
barchart(HALE_2000~WHOregion)

length(y)
length(yhat)

##coefplot for model 1

par(mfrow=c(1,1))

coefplot(lm.out, var.idx=NULL, varnames=c("I", "Polity IV", "Enduring Rivalry", "ELF", "Educ", "Gini", "log GDP per capita"), 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="DV: Health Spending as % of GDP", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,6,5.1,2))

#pdf("coefplot1.pdf", h = 6.5, w = 6.5)


#interaction of level of democracy and education attainment
lm.outi<- lm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97 + POLITY96_97:log_Educational_Attainment)
summary(lm.outi)

##glm to use in the simulations
glm.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(glm.out)

##gls.out
attach(health)
gls.fit<- gls(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97,
correlation=corAR1(.99),
method='ML',na.action=na.omit)
summary(gls.fit)
toLatex(mtable(gls.fit))

##some simulations using the gls.fit model

#democracy scenarios

#high democracy, low per capita gdp
highlow<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),mean(log_Educational_Attainment,na.rm=T),
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
max(POLITY96_97,na.rm=T))

#min democracy, low per capita gdp
lowlow<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),mean(log_Educational_Attainment,na.rm=T),
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
min(POLITY96_97,na.rm=T))

#mean democracy, low per capita gdp
meanlow<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),mean(log_Educational_Attainment,na.rm=T),
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
mean(POLITY96_97,na.rm=T))

#draws from the mvnorm


nsims<-1000

draws1<-mvrnorm(n = nsims, mu=gls.fit$coefficients,
Sigma=vcov(gls.fit))


highlowresult<- draws1%*%highlow
lowlowresult<- draws1%*%lowlow
meanlowresult<- draws1%*%meanlow


plot(density(highlowresult),xlim=c(-1,4), ylim=c(0,1.5), xlab="Health Spending as a % of GDP",ylab="",las=1,main="Simulated Outcomes for High and Low Democracy",
bty="n",lwd=3,col="red")
lines(density(lowlowresult),lwd=3,col="blue" )
lines(density(meanlowresult), lwd=3, col="black")

legend(x="topright",col=c("blue","black", "red"),lty=c(1,1,1), lwd=c(3,3,3),
legend=c("Low", "Mean", "High"))
box()

##adding the stochastic error

###get a point estimate to center your stochastic rnorm on

draw1<-mvrnorm(n = 1, mu=gls.fit$coefficients,
Sigma=vcov(gls.fit))

outcomeHighLow<- highlow%*%draw1
outcomeHighLow

outcomeLowLow<- lowlow%*%draw1
outcomeLowLow

outcomeMeanLow<- meanlow%*%draw1


#do a bunch of draws
#set the number of draws

nsims<-1000

draws1<-mvrnorm(n = nsims, mu=gls.fit$coefficients,
Sigma=vcov(gls.fit))

sig<- gls.fit$sigma

###making a loop###

resultHighLow<- draws1%*%highlow
resultLowLow<- draws1%*%lowlow
resultMeanLow<- draws1%*%meanlow

outcomeA<-c()

i<- resultLowLow[i]
for (i in length(resultLowLow)) {
NewValues<- rnorm(1000, outcomeLowLow, sig)
outcomeA<- c(outcomeA, NewValues)
}

outcomeB<-c()
i<- resultHighLow[i]
for (i in length(resultHighLow)) {
NewValues<- rnorm(1000, outcomeHighLow, sig)
outcomeB<- c(outcomeB, NewValues)
}


outcomeC<-c()
i<- resultMeanLow[i]
for (i in length(resultMeanLow)) {
NewValues<- rnorm(1000, outcomeMeanLow, sig)
outcomeC<- c(outcomeC, NewValues)
}

outcomeA
outcomeB
outcomeC

#result14 and rnorm14 are equal, result15 and rnorm15 are unequal
plot(density(resultHighLow),xlim=c(-3,8), ylim=c(0,1), xlab="Spending on health as a % of GDP",ylab="",las=1,main="",
bty="n",lwd=3,col="red")
lines(density(resultLowLow),lwd=3,col="blue" )
lines(density(resultMeanLow), lwd=3, col="black")
lines(density(outcomeA), lty=2, lwd=1, col="blue") #LowLow
lines(density(outcomeB), lty=2, lwd=1, col="red") #HighLow
lines(density(outcomeC), lty=2, lwd=1, col="black") #MeanLow

legend(x="topright",col=c("red","black","blue","black"),lty=c(1,1,1,2), lwd=c(3,3,3,1),
legend=c("High", "Mean", "LowLow", "Stochastic Error"))
box()
title("Democracy Simulation", line=1)



#looking at low-income countries and educational attainment

#3rd ed, low per capita gdp, median democratic
top<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),log_Educational_Attainment=2.083,
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
median(POLITY96_97,na.rm=T))

summary(log_Educational_Attainment)
summary(POLITY96_97)

#bottom ed, low per capita gdp, median democratic
bottom<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),log_Educational_Attainment=0.03651,
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
median(POLITY96_97,na.rm=T))

#median ed, low per capita gdp, median democratic
mean<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),log_Educational_Attainment=1.643,
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
median(POLITY96_97,na.rm=T))

#draws from the mvnorm


nsims<-1000

draws1<-mvrnorm(n = nsims, mu=gls.fit$coefficients,
Sigma=vcov(gls.fit))


topresult<- draws1%*%top
bottomresult<- draws1%*%bottom
meanresult<- draws1%*%mean


plot(density(topresult),xlim=c(-1,4), ylim=c(0,1.5), xlab="Health Spending as a % of GDP",ylab="",las=1,main="Simulated Outcomes for Educational attainment",
bty="n",lwd=3,col="blue")
lines(density(bottomresult),lwd=3,col="red" )
lines(density(meanresult), lwd=3, col="black")

legend(x="topright",col=c("blue","black", "red"),lty=c(1,1,1), lwd=c(3,3,3),
legend=c("High", "Median", "Low"))


##adding the stochastic error

###get a point estimate to center your stochastic rnorm on

draw1<-mvrnorm(n = 1, mu=gls.fit$coefficients,
Sigma=vcov(gls.fit))

outcometop<- top%*%draw1
outcometop

outcomebottom<- bottom%*%draw1
outcomebottom

outcomeMean<- mean%*%draw1


#do a bunch of draws
#set the number of draws

nsims<-1000

draws1<-mvrnorm(n = nsims, mu=gls.fit$coefficients,
Sigma=vcov(gls.fit))

sig<- gls.fit$sigma

###making a loop###

resulttop<- draws1%*%top
resultbottom<- draws1%*%bottom
resultmean<- draws1%*%mean

outcomeA<-c()

i<- resulttop[i]
for (i in length(resulttop)) {
NewValues<- rnorm(1000, outcometop, sig)
outcomeA<- c(outcomeA, NewValues)
}

outcomeB<-c()
i<- resultbottom[i]
for (i in length(resultbottom)) {
NewValues<- rnorm(1000, outcomebottom, sig)
outcomeB<- c(outcomeB, NewValues)
}


outcomeC<-c()
i<- resultmean[i]
for (i in length(resultmean)) {
NewValues<- rnorm(1000, outcomeMean, sig)
outcomeC<- c(outcomeC, NewValues)
}

outcomeA
outcomeB
outcomeC

#result14 and rnorm14 are equal, result15 and rnorm15 are unequal
plot(density(resultbottom),xlim=c(-3,8), ylim=c(0,1), xlab="Spending on health as a % of GDP",ylab="",las=1,main="",
bty="n",lwd=3,col="red")
lines(density(resulttop),lwd=3,col="blue" )
lines(density(resultmean), lwd=3, col="black")
lines(density(outcomeA), lty=2, lwd=1, col="blue") #top
lines(density(outcomeB), lty=2, lwd=1, col="red") #bottom
lines(density(outcomeC), lty=2, lwd=1, col="black") #median

legend(x="topright",col=c("Blue","black","red","black"),lty=c(1,1,1,2), lwd=c(3,3,3,1),
legend=c("Top", "Median", "Bottom", "Stochastic Error"))

title("Educational Attainment Simulation", line=1)
box()



#bivariate relationship between spending and HALE
plot(pub_exp_prcnt_gdp97who, HALE_2000, col=WHOregion, pch=19, main="Relationship between spending on health and health outcomes", ylab="HALE",xlab="Spending on Health as % of GDP")
abline(lm(HALE_2000~ pub_exp_prcnt_gdp97who))

textxy(pub_exp_prcnt_gdp97who, HALE_2000, ID)

legend(x="bottomright",col=c("black","red", "green", "blue", "cyan"), pch=19,
legend=c("Africa","Americas", "MidEast", "Europe", "AsiaPac"))

##simulations

#democracy scenarios

#high democracy, low per capita gdp
highlow<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),mean(log_Educational_Attainment,na.rm=T),
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
max(POLITY96_97,na.rm=T))

#min democracy, low per capita gdp
lowlow<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),mean(log_Educational_Attainment,na.rm=T),
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
min(POLITY96_97,na.rm=T))

#mean democracy, low per capita gdp
meanlow<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),mean(log_Educational_Attainment,na.rm=T),
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
mean(POLITY96_97,na.rm=T))

#draws from the mvnorm


nsims<-1000

draws1<-mvrnorm(n = nsims, mu=glm.out$coefficients,
Sigma=vcov(glm.out))


highlowresult<- draws1%*%highlow
lowlowresult<- draws1%*%lowlow
meanlowresult<- draws1%*%meanlow


plot(density(highlowresult),xlim=c(-1,4), ylim=c(0,1.5), xlab="Health Spending as a % of GDP",ylab="",las=1,main="Simulated Outcomes for High and Low Democracy",
bty="n",lwd=3,col="red")
lines(density(lowlowresult),lwd=3,col="blue" )
lines(density(meanlowresult), lwd=3, col="black")

legend(x="topright",col=c("blue","black", "red"),lty=c(1,1,1), lwd=c(3,3,3),
legend=c("Low", "Mean", "High"))


##adding the stochastic error

###get a point estimate to center your stochastic rnorm on

draw1<-mvrnorm(n = 1, mu=glm.out$coefficients,
Sigma=vcov(glm.out))

outcomeHighLow<- highlow%*%draw1
outcomeHighLow

outcomeLowLow<- lowlow%*%draw1
outcomeLowLow

outcomeMeanLow<- meanlow%*%draw1


#do a bunch of draws
#set the number of draws

nsims<-1000

draws1<-mvrnorm(n = nsims, mu=glm.out$coefficients,
Sigma=vcov(glm.out))

sig<- sqrt(deviance(glm.out)/df.residual(glm.out))

#residual standard error of regression
sqrt(deviance(lm.out)/df.residual(lm.out))

#residual standard error of glm object
sqrt(deviance(glm.out)/df.residual(glm.out))

###making a loop###

resultHighLow<- draws1%*%highlow
resultLowLow<- draws1%*%lowlow
resultMeanLow<- draws1%*%meanlow

outcomeA<-c()

i<- resultLowLow[i]
for (i in length(resultLowLow)) {
NewValues<- rnorm(1000, outcomeLowLow, sig)
outcomeA<- c(outcomeA, NewValues)
}

outcomeB<-c()
i<- resultHighLow[i]
for (i in length(resultHighLow)) {
NewValues<- rnorm(1000, outcomeHighLow, sig)
outcomeB<- c(outcomeB, NewValues)
}


outcomeC<-c()
i<- resultMeanLow[i]
for (i in length(resultMeanLow)) {
NewValues<- rnorm(1000, outcomeMeanLow, sig)
outcomeC<- c(outcomeC, NewValues)
}

outcomeA
outcomeB
outcomeC

#result14 and rnorm14 are equal, result15 and rnorm15 are unequal
plot(density(resultHighLow),xlim=c(-3,5), ylim=c(0,2), xlab="Spending on health as a % of GDP",ylab="",las=1,main="",
bty="n",lwd=1,col="red", ylab="Density")
lines(density(resultLowLow),lwd=1,col="blue" )
lines(density(resultMeanLow), lwd=1, col="black")
lines(density(outcomeA), lty=2, lwd=1, col="blue") #LowLow
lines(density(outcomeB), lty=2, lwd=1, col="red") #HighLow
lines(density(outcomeC), lty=2, lwd=1, col="black") #MeanLow

legend(x="topright",col=c("red","black","blue","black"),lty=c(1,1,2,2), lwd=c(1,1,1,1),
legend=c("High", "Mean", "LowLow", "Stochastic Error"))

title("Varying Equality with", line=3)
  title("Low Levels of Economic", line=2)
title("Development", line=1)


#looking at low-income countries and educational attainment

#3rd ed, low per capita gdp, median democratic
top<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),log_Educational_Attainment=2.083,
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
median(POLITY96_97,na.rm=T))

summary(log_Educational_Attainment)
summary(POLITY96_97)

#bottom ed, low per capita gdp, median democratic
bottom<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),log_Educational_Attainment=0.03651,
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
median(POLITY96_97,na.rm=T))

#median ed, low per capita gdp, median democratic
mean<-c(1,min(cia_Log_GDP_PerCapita,na.rm=T),
mean(WHOGini1997,na.rm=T),log_Educational_Attainment=1.643,
mean(log_vanhanen,na.rm=T),mean(as.numeric(any_89_Endrng_rivalry1),na.rm=T),
median(POLITY96_97,na.rm=T))

#draws from the mvnorm


nsims<-1000

draws1<-mvrnorm(n = nsims, mu=glm.out$coefficients,
Sigma=vcov(glm.out))


topresult<- draws1%*%top
bottomresult<- draws1%*%bottom
meanresult<- draws1%*%mean


plot(density(topresult),xlim=c(-1,4), ylim=c(0,1.5), xlab="Health Spending as a % of GDP",ylab="",las=1,main="Simulated Outcomes for Educational attainment",
bty="n",lwd=3,col="blue")
lines(density(bottomresult),lwd=3,col="red" )
lines(density(meanresult), lwd=3, col="black")

legend(x="topright",col=c("blue","black", "red"),lty=c(1,1,1), lwd=c(3,3,3),
legend=c("High", "Median", "Low"))


##adding the stochastic error

###get a point estimate to center your stochastic rnorm on

draw1<-mvrnorm(n = 1, mu=glm.out$coefficients,
Sigma=vcov(glm.out))

outcometop<- top%*%draw1
outcometop

outcomebottom<- bottom%*%draw1
outcomebottom

outcomeMean<- mean%*%draw1


#do a bunch of draws
#set the number of draws

nsims<-1000

draws1<-mvrnorm(n = nsims, mu=glm.out$coefficients,
Sigma=vcov(glm.out))

sig<- sqrt(deviance(glm.out)/df.residual(glm.out))

#residual standard error of regression
sqrt(deviance(lm.out)/df.residual(lm.out))

#residual standard error of glm object
sqrt(deviance(glm.out)/df.residual(glm.out))

###making a loop###

resulttop<- draws1%*%top
resultbottom<- draws1%*%bottom
resultmean<- draws1%*%mean

outcomeA<-c()

i<- resulttop[i]
for (i in length(resulttop)) {
NewValues<- rnorm(1000, outcometop, sig)
outcomeA<- c(outcomeA, NewValues)
}

outcomeB<-c()
i<- resultbottom[i]
for (i in length(resultbottom)) {
NewValues<- rnorm(1000, outcomebottom, sig)
outcomeB<- c(outcomeB, NewValues)
}


outcomeC<-c()
i<- resultmean[i]
for (i in length(resultmean)) {
NewValues<- rnorm(1000, outcomeMean, sig)
outcomeC<- c(outcomeC, NewValues)
}

outcomeA
outcomeB
outcomeC

#result14 and rnorm14 are equal, result15 and rnorm15 are unequal
plot(density(resultbottom),xlim=c(-3,5), ylim=c(0,1.5), xlab="Spending on health as a % of GDP",ylab="",las=1,main="",
bty="n",lwd=1,col="red")
lines(density(resulttop),lwd=1,col="blue" )
lines(density(resultmean), lwd=1, col="black")
lines(density(outcomeA), lty=2, lwd=1, col="blue") #top
lines(density(outcomeB), lty=2, lwd=1, col="red") #bottom
lines(density(outcomeC), lty=2, lwd=1, col="black") #median

legend(x="topright",col=c("red","black","blue","black"),lty=c(1,1,2,2), lwd=c(1,1,1,1),
legend=c("High", "Median", "LowLow", "Stochastic Error"))

title("Educational Attainment Simulation", line=3)
box()




##looking at six scenarios for democracy

#summarize data and find six scnerios put in a vector
summary(POLITY96_97)
scenarios<- c(-10.000,-4.000,6.000,2.945,8.000,10.000)

#make a scenario x
6by2 times 2by100

scenarioX<- cbind(1,8.25, .38,1.64,3.24,1.14, scenarios)

#take the betas and vcov of model
betas<- glm.out$coefficients
vcv<- vcov(glm.out)

6by2 and 7 by 100
#scenarios times the transpose of the multivariate draws
yhats<- scenarioX%*%t(mvrnorm(100, betas, vcv))


library(plyr)
library(ggplot2)
longY<- melt(yhats)

head(longY)

#I want to break this apart, too
LilPlot <- qplot(data = longY, x = value, geom = "density", colour = as.factor(X1), main="Democracy")
LilPlot <- LilPlot + facet_grid(X1 ~ .)
print(LilPlot)

#democracy is correlated with life expectancy





#cross validation for model 1
#draw randomly from dataset
Rand<- runif(nrow(health))
Rand



# This should help:
Quintiles <- quantile(Rand, 0:10/10)
#cut random number into those four groups
health$k<-cut(Rand, Quintiles, include.lowest=T)

table(health$k)

#relabel the factor levels
health$k<- factor(health$k, labels=LETTERS[1:10])

attach(health)

EmptyObject<- NULL
EOList<- NULL

#make it so it's not part of the regression
for(kk in LETTERS[1:10]){
Model<- lm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, data = health[health$k != kk, ])

Intercept<- Model$coef[1]
ISE<- summary(Model)$coef[1,2]
gdp <- Model$coef[2]
gdpSE<- summary(Model)$coef[2,2]
gini<- Model$coef[3]
giniSE<- summary(Model)$coef[3,2]
ed<- Model$coef[4]
edSE<- summary(Model)$coef[4,2]
vanhanen<- Model$coef[5]
vanhanenSE<- summary(Model)$coef[5,2]
rivalry<- Model$coef[6]
rivalrySE<- summary(Model)$coef[6,2]
polity<- Model$coef[7]
politySE<- summary(Model)$coef[7,2]




Yhat <- predict(Model, newdata = health[health$k == kk, ])
Y <- health[health$k == kk, "pub_exp_prcnt_gdp97who"]

RMSE<- sqrt(mean((Yhat - Y) ^ 2, na.rm = T))

#error squared, mean and take the square root, squaring it gets it back
#into the original units of  the dependent variable
#it's the loss function that was used to estimate the model

NewData<- c(kk, Intercept, ISE, gdp, gdpSE, gini, giniSE, ed, edSE, vanhanen, vanhanenSE, rivalry, rivalrySE, polity, politySE, RMSE)
EmptyObject <- rbind(EmptyObject,NewData)
#iterating through the model will add a row beneath what is there
}

warnings()
#change names
EmptyObject<- data.frame(EmptyObject)
EmptyObject


coefs<- 



colnames(EmptyObject) <- c("k", "Intercept", "ISE", "gdp","gdpSE", "gini","giniSE", "ed", "edSE", "vanhanen","vanhanenSE", "rivalry","rivalrySE", "polity", "politySE", "RMSE")

EmptyObject$Intercept<- as.numeric(as.character(EmptyObject$Intercept))
EmptyObject$gdp<- as.numeric(as.character(EmptyObject$gdp))
EmptyObject$gini<- as.numeric(as.character(EmptyObject$gini))
EmptyObject$ed<- as.numeric(as.character(EmptyObject$ed))
EmptyObject$vanhanen<- as.numeric(as.character(EmptyObject$vanhanen))
EmptyObject$rivalry<- as.numeric(as.character(EmptyObject$rivalry))
EmptyObject$polity<- as.numeric(as.character(EmptyObject$polity))
EmptyObject$SE<- as.numeric(as.character(EmptyObject$SE))
EmptyObject$RSME<- as.numeric(as.character(EmptyObject$RSME))


#trying to get them all in a table, preferably xtable with coefs, ses, rsme
 
coefs<- data.frame(k= LETTERS[1:10])
coefs$Intercept<- as.numeric(as.character(EmptyObject[,2]))
rbind(coefs[1,], ses[1,], ...



kk, Intercept, gdp, gini, ed, vanhanen, rivalry, polity)
ses<- data.frame(kk, ISE, gdpSE, giniSE, edSE, vanhanenSE, rivalrySE, politySE)

models<- data.frame(coefs, ses)

xtable(data.frame(models))

xtable(EmptyObject, digits=2)

#Going through the new data set with a vector of letters and subsetting and regressing

#first sampled subset
notA <- health[health$k %in% c("B", "C", "D", "E", "F", "G", "H", "I", "J"), ]
notA

attach(notA)
notA.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notA.out)

#second sampled subset
notB <- health[health$k %in% c("A", "C", "D", "E", "F", "G", "H", "I", "J"), ]
notB

notB.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notB.out)

#third sampled subset
notC <- health[health$k %in% c("A", "B", "D", "E", "F", "G", "H", "I", "J"), ]
notC

notC.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notC.out)


#fourth sample of this
notD <- health[health$k %in% c("A", "B", "C", "E", "F", "G", "H", "I", "J"), ]
notD

notD.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notD.out)

#fifth sample of this
notE <- health[health$k %in% c("A", "B", "C", "D", "F", "G", "H", "I", "J"), ]
notE

notE.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notE.out)

#sixth sample of this
notF <- health[health$k %in% c("A", "B", "C", "D", "E", "G", "H", "I", "J"), ]
notF

notF.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notF.out)

#seventh sample of this
notG <- health[health$k %in% c("A", "B", "C", "D", "E", "F", "H", "I", "J"), ]
notG

notG.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notG.out)

#eigth sample of this
notH <- health[health$k %in% c("A", "B", "C", "D", "E", "F", "G", "I", "J"), ]
notH

#ninth sample of this
notI <- health[health$k %in% c("A", "B", "C", "D", "E", "F", "G", "H", "J"), ]
notI

notI.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notI.out)

#tenth sample of this
notJ <- health[health$k %in% c("A", "B", "C", "D", "E", "F", "G", "H", "I"), ]
notJ


notJ.out<- glm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97, family=gaussian)
summary(notJ.out)

library(apsrtable)
apsrtable(notJ.out, notI.out)



###Second Model: Total health expenditures### WORKS! #
attach(health)
health$total98log<- as.numeric(X._K_total98_log)

attach(health)
lm.out2<- lm(as.numeric(total98log)~ cia_Log_GDP_PerCapita + log_Educational_Attainment + prvt_exp_rcnt_gdp + pub_98_gdp_prcnt)
summary(lm.out2)

#coefplot

coefplot(lm.out2, var.idx=NULL, varnames=c("I", "GDP per capita", "Educ", "Private Spending", "Public Spending"), 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="DV: Total Spending on Health", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,6,5.1,2))

lm.out2dem<- lm(as.numeric(total98log)~ cia_Log_GDP_PerCapita + log_Educational_Attainment + prvt_exp_rcnt_gdp + pub_98_gdp_prcnt + POLITY96_97)
summary(lm.out2dem)


coefplot(lm.out2dem, var.idx=NULL, varnames=NULL, 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="Total Spending on Health", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,8,5.1,2))


xtable(lm.out2, digits=3)

###Third Model: HALE###  WORKS!! #

attach(health)
lm.out3<- lm(HALE_2000~ as.numeric(total98log) + GROWTH_URBAN9095_POP_UN + WHOGini1997 + log_vanhanen + log_Educational_Attainment +  deathn91_97 + contig_civil_war)
summary(lm.out3)


coefplot(lm.out3, var.idx=NULL, varnames=c("I", "Total Spending", "Urban Growth", "Gini", "ELF", "Educ", "Deaths 91-97", "Contig War"), 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="DV: HALE", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,6,5.1,2))


#plotting fitted vs. observed
#missing an observation for Monaco

cbind(GROWTH_URBAN9095_POP_UN, as.factor(COUNTRY_NAME))

health[is.na(health$GROWTH_URBAN9095_POP_UN),] #164

#making a new dataset without the missing #162
newhealth<- health[!is.na(health$GROWTH_URBAN9095_POP_UN),]

attach(newhealth)
lm.out3<- lm(HALE_2000~ as.numeric(total98log) + GROWTH_URBAN9095_POP_UN + WHOGini1997 + log_vanhanen + log_Educational_Attainment +  deathn91_97 + contig_civil_war)
summary(lm.out3)

yhat<- lm.out3$fitted.values
y<- HALE_2000

assign.cols <- ifelse(WHOregion==1,"gray","black")
plot(y, yhat, pch=18, col=assign.cols, xlim=c(25,80), ylim=c(25,80), ylab="Predicted Values", xlab="HALE 2000 (observed)", main="Fitted vs. Observed Values of HALE")
abline(0,1)


legend(x="bottomright",col=c("gray", "black"), pch=18,
legend=c("Africa", "All else"))


#getting the fitted vs. observed for the first model
m1health<- health[!is.na(health$cia_Log_GDP_PerCapita),]

#running model 1 without the missing variable
attach(m1health)
lm.out<- lm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + POLITY96_97)
summary(lm.out)

yhat1<- lm.out$fitted.values
y1<-pub_exp_prcnt_gdp97who
plot(y1, yhat1, col=WHOregion, xlim=c(0,9), ylim=c(0,9), pch=18, ylab="Predicted Values", xlab="Health Spending as % of GDP (observed)", main="Fitted vs. Observed Values of Health Spending")
abline(0,1)


legend(x="bottomright",col=c("black","red", "green", "blue", "cyan"), pch=18,
legend=c("Africa","Americas", "MidEast", "Europe", "AsiaPac"))



#adding in democracy which doesn't do much
lm.out3dem<- lm(HALE_2000~ as.numeric(total98log) + GROWTH_URBAN9095_POP_UN + WHOGini1997 + log_vanhanen + log_Educational_Attainment +  deathn91_97 + contig_civil_war +  POLITY96_97)
summary(lm.out3dem)

#coefplot with democracy added
coefplot(lm.out3dem, var.idx=NULL, varnames=NULL, 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=3, pch.pts=20, var.las=2, 
            main="HALE", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,8,5.1,2))


###############################cross validation for model 3
#draw randomly from dataset
Rand<- runif(nrow(health))
Rand



# This should help:
Quintiles <- quantile(Rand, 0:10/10)
#cut random number into those four groups
health$k<-cut(Rand, Quintiles, include.lowest=T)

table(health$k)

#relabel the factor levels
health$k<- factor(health$k, labels=LETTERS[1:10])

attach(health)

EmptyObject<- NULL


#make it so it's not part of the regression
for(kk in LETTERS[1:10]){
Model<- lm(HALE_2000~ as.numeric(total98log) + GROWTH_URBAN9095_POP_UN + WHOGini1997 + log_vanhanen + log_Educational_Attainment +  deathn91_97 + contig_civil_war, data = health[health$k != kk, ])


Intercept<- Model$coef[1]
ISE<- summary(Model)$coef[1,2]
urban<- Model$coef[2]
urbanSE<- summary(Model)$coef[2,2]
gini<- Model$coef[3]
giniSE<- summary(Model)$coef[3,2]
vanhanen<- Model$coef[4]
vanhanenSE<- summary(Model)$coef[4,2]
ed<- Model$coef[5]
edSE<- summary(Model)$coef[5,2]
deaths<- Model$coef[6]
deathsSE<- summary(Model)$coef[6,2]
civilwar<- Model$coef[7]
civilwarSE<- summary(Model)$coef[7,2]




Yhat <- predict(Model, newdata = health[health$k == kk, ])
Y <- health[health$k == kk, "HALE_2000"]

RMSE<- sqrt(mean((Yhat - Y) ^ 2, na.rm = T))

#error squared, mean and take the square root, squaring it gets it back
#into the original units of  the dependent variable
#it's the loss function that was used to estimate the model

NewData<- c(kk, Intercept, ISE, urban, urbanSE, gini, giniSE, vanhanen, vanhanenSE, ed, edSE, deaths, deathsSE, civilwar, civilwarSE, RMSE)
EmptyObject <- rbind(EmptyObject,NewData)
#iterating through the model will add a row beneath what is there
}

warnings()
#change names
EmptyObject<- data.frame(EmptyObject)
EmptyObject

colnames(EmptyObject) <- c("k", "Intercept", "ISE", "gdp","gdpSE", "gini","giniSE", "ed", "edSE", "vanhanen","vanhanenSE", "deaths","deathsSE", "civilwar", "civilwarSE", "RMSE")

EmptyObject$Intercept<- as.numeric(as.character(EmptyObject$Intercept))
EmptyObject$gdp<- as.numeric(as.character(EmptyObject$gdp))
EmptyObject$gini<- as.numeric(as.character(EmptyObject$gini))
EmptyObject$ed<- as.numeric(as.character(EmptyObject$ed))
EmptyObject$vanhanen<- as.numeric(as.character(EmptyObject$vanhanen))
EmptyObject$rivalry<- as.numeric(as.character(EmptyObject$rivalry))
EmptyObject$polity<- as.numeric(as.character(EmptyObject$polity))
EmptyObject$SE<- as.numeric(as.character(EmptyObject$SE))
EmptyObject$RSME<- as.numeric(as.character(EmptyObject$RSME))


#summarize data and find six scnerios put in a vector
summary(POLITY96_97)
scenarios<- c(-10.000,-4.000,6.000,2.945,10.000,10.000)

#make a scenario x
6by2 times 2by100
scenarioX<- cbind(1, 1,1,1,1,1,1,1, scenarios)

#take the betas and vcov of model
betas<- lm.out3dem$coefficients
vcv<- vcov(lm.out3dem)

6by2 and 7 by 100
#scenarios times the transpose of the multivariate draws
yhats<- scenarioX%*%t(mvrnorm(100, betas, vcv))


longY<- melt(yhats)

head(longY)

#I want to break this apart, too
LilPlot <- qplot(data = longY, x = value, geom = "density", colour = as.factor(X1))
LilPlot <- LilPlot + facet_grid(X1 ~ .)
print(LilPlot)


cor(HALE_2000, pub_exp_prcnt_gdp97who)

library(calibrate)

#plot with text and hale vs total spending
plot(total98log, HALE_2000, col=WHOregion, pch=19, main="HALE and Total Spending", ylab="HALE 2000", xlab="log Total Spending 1998")

textxy(total98log, HALE_2000, ID)
abline(lm(HALE_2000~total98log))

legend(x="bottomright",col=c("black","red", "green", "blue", "cyan"), pch=19,
legend=c("Africa","Americas", "MidEast", "Europe", "AsiaPac"))

##HALE vs.gdp per capita - the Preston Curve
plot(cia_Log_GDP_PerCapita, HALE_2000,  pch=19, main="Preston Curve", ylab="HALE 2000", xlab="GDP per capita", col=WHOregion)
textxy((cia_Log_GDP_PerCapita, na.rm=T), HALE_2000, ID)
abline(lm(HALE_2000~cia_Log_GDP_PerCapita))


legend(x="bottomright",col=c("black","red", "green", "blue", "cyan"), pch=19,
legend=c("Africa","Americas", "MidEast", "Europe", "AsiaPac"))

apsrtable(lm.out3)
xtable(lm.out3, digits=3)


############
NEW STUFF
#############



#making a FH average score

attach(health)
health$FH_status<- (as.numeric(FH_Civil_Liberties_Index97) + as.numeric(Freedom_House_Politcal_Rights_In))
attach(health)

#average FH score
health$FHavg<- (health$FH_status)/2

#make it so it ranges from bad to good like polity
FHrescale<- 8-(health$FHavg)
attach(health)

#checking to see whether the fix has been made
plot(FHrescale, HALE_2000)
abline(lm(HALE_2000~ FHrescale))

#rescale FHavg so it ranges between 0 and 10
scalar<- 10/7

health$FHrescale<- FHrescale*scalar

#rescaling polity so it ranges between 0 and 10

polity10<- (POLITY96_97)/2
polity10<- 5+polity10

#making sure that the new variables have the same positive correlation
cor(FHavg, POLITY96_97)
cor(FHrescale, polity10)

#now add them together and make it out of 20
quality<- (FHrescale+polity10)/2

plot(quality, HALE_2000)
abline(lm(HALE_2000~quality))


#swapping POLITY for quality
fh.out<- lm(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + quality)
summary(fh.out)

coefplot(fh.out, var.idx=NULL, varnames=NULL, 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="Model 1.1", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,8,5.1,2))


#putting in quality in equation two
total.out<- lm(as.numeric(total98log)~ cia_Log_GDP_PerCapita + log_Educational_Attainment + prvt_exp_rcnt_gdp + pub_98_gdp_prcnt + quality)
summary(total.out)


coefplot(total.out, var.idx=NULL, varnames=NULL, 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="Model 2.1", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,8,5.1,2))


attach(health)
#adding  in quality to HALE equation
hale.out<- lm(HALE_2000~ as.numeric(total98log) + GROWTH_URBAN9095_POP_UN + WHOGini1997 + log_vanhanen + log_Educational_Attainment +  deathn91_97 + contig_civil_war +  quality)
summary(hale.out)



coefplot(hale.out, var.idx=NULL, varnames=NULL, 
            CI=2, vertical=TRUE,
            v.axis=TRUE, h.axis=TRUE, 
            cex.var=0.8, cex.pts=0.9, 
            col.pts=1, pch.pts=20, var.las=2, 
            main="Model 3.1", xlab=NULL, ylab=NULL, 
            plot=TRUE, add=FALSE, offset=.1,
            mar=c(1,8,5.1,2))


cor(FHavg,POLITY96_97)

#trying gls with quality

attach(health)
gls.qual.fit<- gls(pub_exp_prcnt_gdp97who~ cia_Log_GDP_PerCapita  + WHOGini1997 + log_Educational_Attainment + log_vanhanen +  any_89_Endrng_rivalry1 + quality,
correlation=corAR1(.99),
method='ML',na.action=na.omit)
summary(gls.qual.fit)

toLatex(mtable(gls.qual.fit))


